In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# DISCRIMINANT ANALYSIS – LDA and QDA
# Import necessary libraries
! pip install pandas numpy scikit-learn ISLP
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score
In [3]:
# Load the dataset either from the website
# url = "http://fs2.american.edu/baron/www/627/R/Data_from_the_textbook/Auto.csv"
# Auto = pd.read_csv(url)
# or from the ISLP library
from ISLP import load_data
Auto = load_data('Auto')
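In [ ]:
# Optional sanity check (not part of the original analysis): confirm the
# number of rows/columns and preview the first few records after loading.
print(Auto.shape)
print(Auto.head())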
In [4]:
# Define ECO ratings based on 'mpg'
Auto['mpg'].describe()
Out[4]:
count    392.000000
mean      23.445918
std        7.805007
min        9.000000
25%       17.000000
50%       22.750000
75%       29.000000
max       46.600000
Name: mpg, dtype: float64
In [5]:
Auto['ECO'] = pd.cut(Auto['mpg'], bins=[-np.inf, 17.5, 23, 29, np.inf],
labels=['Heavy', 'OK', 'Economy', 'Excellent'])
# Check the distribution of ECO
print(Auto['ECO'].value_counts())
ECO
Heavy        104
OK           101
Excellent     95
Economy       92
Name: count, dtype: int64
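In [ ]:
# The bin edges above (17.5, 23, 29) roughly track the mpg quartiles reported
# by describe(). An alternative sketch, not used below: pd.qcut derives
# equal-frequency bins directly from the data, giving nearly balanced classes.
eco_q = pd.qcut(Auto['mpg'], q=4, labels=['Heavy', 'OK', 'Economy', 'Excellent'])
print(eco_q.value_counts())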
In [6]:
# pd.cut already returns a categorical column; this cast just makes it explicit
Auto['ECO'] = Auto['ECO'].astype('category')
# Prepare feature set (acceleration, year, horsepower, weight) and target (ECO)
X = Auto[['acceleration', 'year', 'horsepower', 'weight']]
y = Auto['ECO']
In [7]:
X.dtypes
Out[7]:
acceleration    float64
year              int64
horsepower        int64
weight            int64
dtype: object
In [8]:
# Note: in the raw Auto.csv, 'horsepower' uses '?' for missing values. The ISLP
# version loaded above is already numeric (see dtypes), so this replacement is
# a no-op here; it is only needed when loading from the CSV.
Auto['horsepower'] = Auto['horsepower'].replace('?', np.nan)
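In [ ]:
# If loading from the CSV instead (a hypothetical alternative route via the
# commented-out url above), a minimal sketch for handling '?' at read time:
# let read_csv treat '?' as NaN, then drop the affected rows.
# Auto = pd.read_csv(url, na_values='?')
# Auto = Auto.dropna(subset=['horsepower'])
# Auto['horsepower'] = Auto['horsepower'].astype(int)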
In [9]:
# Linear Discriminant Analysis (LDA)
lda = LDA()
lda.fit(X, y)
# Predict using the trained LDA model
y_pred_lda = lda.predict(X)
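In [ ]:
# A quick look inside the fitted model (a sketch using standard scikit-learn
# LDA attributes): estimated class priors and per-class mean vectors.
print("Classes:", lda.classes_)
print("Estimated priors:", lda.priors_)
print("Class means:")
print(pd.DataFrame(lda.means_, index=lda.classes_, columns=X.columns))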
In [10]:
# Confusion matrix and accuracy for LDA
conf_matrix_lda = confusion_matrix(y, y_pred_lda)
accuracy_lda = accuracy_score(y, y_pred_lda)
In [11]:
print("LDA Confusion Matrix:")
print(conf_matrix_lda)
print("LDA Accuracy: {:.4f}".format(accuracy_lda))
LDA Confusion Matrix:
[[64 17  0 11]
 [12 81  0  2]
 [ 0  0 90 14]
 [21  1  4 75]]
LDA Accuracy: 0.7908
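In [ ]:
# Reading the confusion matrix: rows are true classes, columns are predicted
# classes, both in lda.classes_ order. A sketch computing per-class recall
# as the diagonal divided by each row total.
per_class_recall = np.diag(conf_matrix_lda) / conf_matrix_lda.sum(axis=1)
print(pd.Series(per_class_recall, index=lda.classes_))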
In [12]:
# LDA with specified prior probabilities
lda_prior = LDA(priors=[0.25, 0.25, 0.25, 0.25])
y_pred_lda_prior = cross_val_predict(lda_prior, X, y, cv=5)
lda_prior.fit(X, y)  # also fit on the full data; cross_val_predict above uses its own internal fits
# Confusion matrix and accuracy with specified priors
conf_matrix_lda_prior = confusion_matrix(y, y_pred_lda_prior)
accuracy_lda_prior = accuracy_score(y, y_pred_lda_prior)
print("LDA with Prior Confusion Matrix:")
print(conf_matrix_lda_prior)
print("LDA with Prior Accuracy: {:.4f}".format(accuracy_lda_prior))
LDA with Prior Confusion Matrix:
[[54 23  0 15]
 [20 73  0  2]
 [ 0  0 88 16]
 [22  3  4 72]]
LDA with Prior Accuracy: 0.7321
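In [ ]:
# Caution when comparing: the 0.7908 above is resubstitution (training-set)
# accuracy, while the 0.7321 here is 5-fold cross-validated. A sketch putting
# the default-prior LDA on the same cross-validated footing before comparing.
y_pred_lda_cv = cross_val_predict(LDA(), X, y, cv=5)
print("LDA (default priors) CV Accuracy: {:.4f}".format(accuracy_score(y, y_pred_lda_cv)))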
In [13]:
# Quadratic Discriminant Analysis (QDA)
qda = QDA(priors=[0.25, 0.25, 0.25, 0.25])
y_pred_qda = cross_val_predict(qda, X, y, cv=5)
qda.fit(X, y)
# Confusion matrix and accuracy for QDA
conf_matrix_qda = confusion_matrix(y, y_pred_qda)
accuracy_qda = accuracy_score(y, y_pred_qda)
print("QDA Confusion Matrix:")
print(conf_matrix_qda)
print("QDA Accuracy: {:.4f}".format(accuracy_qda))
QDA Confusion Matrix:
[[51 21  0 20]
 [25 68  0  2]
 [ 0  0 86 18]
 [23  0  4 74]]
QDA Accuracy: 0.7117
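In [ ]:
# LDA assumes one covariance matrix shared by all classes; QDA estimates a
# separate covariance per class. A rough diagnostic sketch (an added check,
# not from the original analysis): compare the spread of one feature across
# classes to see how far the equal-covariance assumption is stretched.
print(Auto.groupby('ECO', observed=True)['weight'].std())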
In [14]:
# Posterior probabilities for first 20 observations
lda_posterior = lda.predict_proba(X[:20])
print("LDA Posterior Probabilities (first 20 rows):")
print(pd.DataFrame(lda_posterior, columns=lda.classes_))
# Summing the posterior probabilities for each row (should add up to 1)
print("Row sums of posterior probabilities:")
print(np.sum(lda_posterior, axis=1))
LDA Posterior Probabilities (first 20 rows):
         Economy     Excellent         Heavy        OK
0   7.168583e-04  6.070189e-07  8.693823e-01  0.129900
1   3.912607e-05  1.434172e-08  9.914524e-01  0.008508
2   8.622131e-04  7.950708e-07  9.215251e-01  0.077612
3   1.030914e-03  8.918585e-07  9.157284e-01  0.083240
4   8.514916e-04  8.599892e-07  8.914880e-01  0.107660
5   9.967844e-09  4.559089e-13  9.999710e-01  0.000029
6   4.349652e-09  1.680365e-13  9.999910e-01  0.000009
7   7.284857e-09  3.453575e-13  9.999857e-01  0.000014
8   2.024389e-09  5.648859e-14  9.999952e-01  0.000005
9   2.355882e-06  5.395486e-10  9.991637e-01  0.000834
10  1.178452e-04  6.711477e-08  9.860518e-01  0.013830
11  6.617738e-05  4.181339e-08  9.851480e-01  0.014786
12  1.936976e-05  7.738634e-09  9.899570e-01  0.010024
13  6.428759e-03  8.812579e-06  9.732992e-01  0.020263
14  4.651891e-01  1.178638e-02  4.783595e-04  0.522546
15  8.572947e-02  5.480650e-04  1.428626e-02  0.899436
16  1.165114e-01  8.680732e-04  1.052362e-02  0.872097
17  2.107162e-01  2.914475e-03  1.625043e-03  0.784744
18  6.784941e-01  3.844641e-02  4.224588e-05  0.283017
19  7.864024e-01  1.003046e-01  2.624296e-07  0.113293
Row sums of posterior probabilities:
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
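In [ ]:
# The predicted label is simply the class with the largest posterior
# probability. A sketch verifying this for the first 20 rows against y_pred_lda.
argmax_labels = lda.classes_[np.argmax(lda_posterior, axis=1)]
print(argmax_labels)
print(np.all(argmax_labels == y_pred_lda[:20]))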